# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# File Name          : _survey_exp_toolkit.R
# Programmer Name    : Luis Campos
#                     soyluiscampos@gmail.com
#
# Purpose            : This file has the various functions for conducting our
#						post-stratified and re-weighted estimations of treatment
#						effect.
#
# Input              : None
# Output             : None
# Usage              : 
# 
# References         : None
#
#
# Platform           : R
# Version            : v3.3.0
# Date               : 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Replication Information:
#   We do expect differences when running this code on different systems 
#   (and with different seeds), but the differences all tend to be quite 
#   small and the general trends remain. If you would like to replicate 
#   the exact numbers and figures used in the article, in the readme you 
#   will find the session info. Please install and load all package 
#   versions (including the R version) to ensure the exact replication. 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #


###############################################################
##  Functions for analyzing an observed data set
###############################################################


## Calculate tau-hat for the many different tau-hat in our paper
##
## Params
## Yobs observed outcome
## Tx treatment vector
## wts weights (= to pi-bar/pi_i)
## K = number of strata.  If K <= 1 DON'T STRATIFY
calc.txs = function( Yobs, Tx, wts, K=5 ) {
	
	u = Yobs * wts
	Z = sum(wts)
	n = length(Yobs)
	n1 = sum(Tx)
	v = u * n / Z
	Tx = Tx == 1  # change Tx to logical for indexing
	
	stopifnot( (n1 > 0) && ( n1 < n ) )
	
	# SATE
	tau.sate = mean( Yobs[Tx] ) - mean( Yobs[!Tx] ) 
	
	# weighted simple-difference
	tau.sd = mean( u[Tx] ) - mean( u[!Tx] )
	
	# hajek
	tau.h = mean( v[Tx] ) - mean( v[!Tx] )

	# double-hajek
	Z1 = sum(wts[Tx])
	Z0 = sum(wts[!Tx])
	tau.hh = sum(u[Tx])/Z1 - sum(u[!Tx]) / Z0
	
	# post-stratified versions
	
	if ( K > 1 ) {
		# make strata
		strats = stratify( wts, K=K )
		
		tau.ps = calc.tau.ps( u, Tx, strats )
		
		tau.ps.h = calc.tau.ps( v, Tx, strats )
	
		# hajek in each of the strata  --- more computational.  Sad for the sim studies.  Optimize?
		df = data.frame( Yobs = Yobs, u=u, Tx=Tx, wts=wts, b=strats )
		sstat = ddply( df, .(b), summarize, Zk = sum(wts),
											Z1k = sum(wts[Tx]),
											Z0k = sum(wts[!Tx]),
											u1T = sum(u[Tx]),
											u0T = sum(u[!Tx]) )
		tau.k = sstat$u1T/sstat$Z1k - sstat$u0T/sstat$Z0k 
		tau.ps.hh = sum( tau.k * sstat$Zk ) / Z
	} else {
		tau.ps = tau.ps.h = tau.ps.hh = NA
	}
		

	c( tau.sate=tau.sate, 								# simple
		tau.sd=tau.sd, tau.h=tau.h, tau.hh = tau.hh, 	# weighted, no adjustment
		tau.ps = tau.ps, tau.ps.h = tau.ps.h, 			# post-stratified
		tau.ps.hh = tau.ps.hh,  						# post-stratified double-hajek
		tau.star1 = tau.sate - tau.hh,                   # SATE v PATE
		tau.star2 = tau.sate - tau.ps.hh,                   # SATE v PATE
		n=n, Z=Z )	
}




## Calculate tau-hat using OLS adjustment
##
## Params
## formula: Formula that MUST have Yobs as outcome and Tx as covariate.
## data  data.frame with columns Yobs, Tx, wts 
##    Also can have covariates
## In the data.frame we have:
##     Yobs observed outcome
##     Tx treatment vector
##     wts weights (= to pi-bar/pi_i)
## K = number of strata.  If K <= 1 DON'T STRATIFY
##
## Examples:
## # This is post-stratified est from above
## calc.tx.ols( Yobs ~ Tx * strata, Yobs, Tx, wts, K=5 )  
##
## # This is simple-difference
## calc.tx.ols( Yobs ~ Tx, Yobs, Tx, wts, K=5 )
##
## # This uses covariate blurg to adjust
## calc.tx.ols( Yobs ~ Tx * (strata + blurg), Yobs, Tx, wts, K=5 )
calc.txs.ols = function( formula, data, resp.var="Yobs", tx.var="Tx", wts.var="wts", K=5, return.model=FALSE ) {

	# extract info from formula
	terms = terms(formula)
	vars = attr( terms,"variables" )
	#resp.var = vars[ attr( terms,"response" ) ]
	Yobs = data[[resp.var]]
	Tx = data[[tx.var]]
	wts = data[[wts.var]]
			
	u = Yobs * wts
	Z = sum(wts)
	n = length(Yobs)
	n1 = sum(Tx)
	v = u * n / Z
	
	stopifnot( (n1 > 0) && ( n1 < n ) )

	dat = data
	
	# ols versions
	if ( K > 1 ) {
		# make strata
		dat$strata = stratify( wts, K=K )
	} 
	
	get.tx = function( ll, ndat ) {
		dat1 = ndat
		dat1[tx.var] = 1
		dat0 = ndat
		dat0[tx.var] = 0
		mean( predict( ll, dat1 ) - predict( ll, dat0 ) )
	}
	
	tau.sate.lm = lm( formula, data=dat, x=return.model )
	tau.sate = get.tx( tau.sate.lm, dat )
	
	dat[resp.var] = u
	tau.sd.lm = lm( formula, data=dat )
	tau.sd = get.tx( tau.sd.lm, dat )
	
	dat[resp.var] = v
	tau.h.lm = lm( formula, data=dat )
	tau.h = get.tx( tau.h.lm, dat )
	
	resp = c( tau.ols.sate=tau.sate, tau.ols.sd=tau.sd, tau.ols.h=tau.h )
	if ( return.model ) {
		list( resp=resp, model=tau.sate.lm, data=dat )
	} else {
		resp
	}
}




###############################################################
## Functions for analyzing estimators when given the
## full population parameters
###############################################################

# y are 
calc.var.tau.sd = function( y1, y0, p, n ) {
	
	mu = mean(y)
	
	se = (1/n) * mean( (mean( p )/p) * (y - mu)^2 )
	
	se
}



###############################################################
## Functions for generating bootstrap standard
## error estimates
###############################################################


# @param K number of strata for post-stratification.
calc.boot.se = function( Yobs, Tx, w, K, R = 1000 ) {
	edat = data.frame( Yobs=Yobs, Tx=Tx, w=w )
	
	# bootstrap standard errors
	SErep = replicate( R, {
		
		edat.star = sample( nrow(edat), replace=TRUE  )
		edat.star = edat[ edat.star, ]
		calc.txs( edat.star$Yobs, edat.star$Tx, edat.star$w, K=K )
		
	} )
	
	
	rs  = melt( SErep )
	rs = subset( rs, Var1 != "Z" & Var1 != "n" )
	
	res = ddply( rs, .(Var1), summarize,
			n.na = sum( is.na( value ) ),
			mean = mean( value, na.rm=TRUE ),
			SE = sd( value, na.rm=TRUE ) )
	
	res
}


###############################################################
## Functions for conducting fake experiments on fake
## populations for simulation studies
###############################################################

# bernoulli sampling function
# Sample to get expected n units (if n not null)
# return a vector of whether unit is in the sample or not.  (logical vector)
bern.sample = function( pies ) {
	unif = runif( length(pies) )
	S = (unif <= pies)
}



# Given a complete set of potential outcomes for an already selected sample
# in dataframe dat, run an experiment and 
# generate the observed outcomes in Y and treatment vector
# Tx
randomizeDat = function( dat, binom.assignment=FALSE, p =0.5) {
	n = nrow(dat)
	if ( binom.assignment ) {
			tx = sample( c(TRUE,FALSE), n, prob=c(p, 1-p), replace=TRUE )
		} else {
			tx = sample( n ) <= n*p
		}
	dat$Tx = tx
	dat$Yobs = ifelse( dat$Tx, dat$Y1, dat$Y0 )
	dat
}


# calc the self-weighted and post-stratified treatment
# effect estimators given outcome variable, Tx, and b
calc.tau.ps = function( Y, Tx, b ) {
	stopifnot( !is.null(b) & !is.null(Tx) & !is.null(Y))
	n = length(Y)

	mns = tapply( Y, list( b, Tx ), mean )
	tx <- sum( (mns[,2] - mns[,1]) * table(b)/n )

	tx
}


## Calculate the strata given a vector of realized weights
stratify = function( wts, K ) {
	cut( wts, breaks=quantile(wts,(0:K)/K), include.lowest=TRUE )
}


calc.tau.sd.SE = function( Yobs, Tx, wts ) {	
	u = Yobs * wts
	n = length(Tx)
	n1 = sum(Tx)
	n0 = n - n1
	
	mu1.hat = mean( u[Tx] )
	mu0.hat = mean( u[!Tx] )
	
	stop( "Not finished being implemented." )	
}





###############################################################
##  Testing and demo code
###############################################################


if ( FALSE ) {
	
	N = 10000
	K = 7
	n = 100
	
	# Generate some data (some huge population)
	dat = data.frame( Y1 = 1:N, Y0 = runif(N,1,100), w=runif(N,1,20) )
	dat$Y0 = dat$Y0 + dat$w
	dat$Y1 = dat$Y0 + 10*sqrt( (max(dat$w) - dat$w)) + rnorm(N)
	dat$t = dat$Y1 - dat$Y0
	dat$pi = 1 / dat$w 
	dat$pi = n * dat$pi / sum(dat$pi)
	dat$w = mean(dat$pi) / dat$pi
	
	# look at it
	plot( dat[ sample(nrow(dat), 1000 ), ] )
	
	tau = mean(dat$Y1) - mean(dat$Y0)
	cat( "tau = ", tau, "\n" )
	
	summary(dat)
	max(dat$w) / min(dat$w)
	
	samp.ind = bern.sample( dat$pi )
	edat = dat[ samp.ind, ] 
	nrow(edat)
	edat = randomizeDat( edat )
	
	
	# calculate several estimates of tx effect
	calc.txs( edat$Yobs, edat$Tx, edat$w, K=K )
	
	
	# look at relationship of variables
	edat$u1 = edat$Y1 * edat$w
	edat$u0 = edat$Y0 * edat$w
	edat$b = stratify( edat$w, K )
	plot( edat[c("Y1","Y0","w","u1","u0","b")] )


		
}








